Filter criteria:

glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 2928
## Number of samples: 86 (51 ASD, 35 controls)

Dimensionality reduction using PCA

First principal component explains over 89.7% of the total variance (lower than 90%…)

reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
  
  datExpr = data.frame(datExpr)
  
  datExpr_pca = prcomp(datExpr, scale.=TRUE)
  last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>% 
    filter(.[[2]] >= var_explained) %>% top_n(-1, ID)
  
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  abline(v=substr(last_pc$ID, 3, nchar(last_pc$ID)), col='blue')
  plot(summary(datExpr_pca)$importance[3,], type='b')
  abline(h=var_explained, col='blue')
  
  
  print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
             var_explained*100, '% of the variance'))
  
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
  
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}

reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)

## Keeping top 11 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output

rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)

Perhaps the first component is determined by a small number of samples and by removing them we can lower the variance explained by it:

plot(sort(pca_output$rotation[,1], decreasing = TRUE))

Genes are still separated into two clouds of points:

datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()

Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.

load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')

# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]

# Calculate differential expression p-value of each gene
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
  
ASD_pvals = ordered_top_genes$adj.P.Val
log_fold_change = ordered_top_genes$logFC

pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point() + theme_minimal() +
  scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)

rm(keep, mod, corfit, lmfit, fit, ASD_pvals, log_fold_change, top_genes, ordered_top_genes, datExpr, datProbes, 
   datSeq, p1, p2)

SFARI genes dataset

Only 165 of our 2928 genes appear on the SFARI list, losing all 25 genes with a score of 1

SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv', 
    col_types = cols(`number-of-reports` = col_integer(), syndromic = col_logical()))

print('Gene Score count of all genes:')
## [1] "Gene Score count of all genes:"
SFARI_genes$`gene-score` %>% table
## .
##   1   2   3   4   5   6 
##  25  62 195 454 175  25
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% mutate('gene-symbol'=hgnc_symbol) %>% 
                   dplyr::select('ensembl_gene_id', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')

gene_scores = SFARI_genes %>% dplyr::select(ensembl_gene_id, `gene-score`, syndromic) %>% 
              right_join(gene_names, by='ensembl_gene_id') %>% dplyr::select(-'gene-symbol')

# Full (ordered) list of datExpr genes with their scores
gene_scores = data.frame('ensembl_gene_id'=rownames(datExpr_redDim)) %>% 
              left_join(gene_scores, by = 'ensembl_gene_id') %>%
              mutate('syndromic' = ifelse(syndromic==FALSE | is.na(syndromic), FALSE, TRUE),
                     'gene-bool' = ifelse(is.na(`gene-score`), FALSE, TRUE))

'Gene score count of genes in datExpr:'
## [1] "Gene score count of genes in datExpr:"
gene_scores$`gene-score` %>% table
## .
##  2  3  4  5  6 
##  5 31 66 40  4
rm(SFARI_genes, mart, gene_names)

Clustering

clusterings = list()

clusterings[['SFARI_score']] = gene_scores$`gene-score`
names(clusterings[['SFARI_score']]) = gene_scores$ensembl_gene_id

clusterings[['SFARI_bool']] = gene_scores$`gene-bool`
names(clusterings[['SFARI_bool']]) = gene_scores$ensembl_gene_id

clusterings[['syndromic']] = gene_scores$syndromic
names(clusterings[['syndromic']]) = gene_scores$ensembl_gene_id

K-means Clustering

set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
                                      algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')

datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster

Hierarchical Clustering

Chose k=6 as best number of clusters.

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.

h_clusts = datExpr_redDim %>% dist %>% hclust
# plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 6
clusterings[['hc']] = cutree(h_clusts, best_k)

create_viridis_dict = function(){
  min_score = clusterings[['SFARI_score']] %>% min(na.rm=TRUE)
  max_score = clusterings[['SFARI_score']] %>% max(na.rm=TRUE)
  viridis_score_cols = viridis(max_score - min_score + 1)
  names(viridis_score_cols) = seq(min_score, max_score)
  
  return(viridis_score_cols)
}

viridis_score_cols = create_viridis_dict()

dend_meta = gene_scores[match(labels(h_clusts), gene_scores$ensembl_gene_id),] %>% 
            mutate('SFARI_score' = viridis_score_cols[`gene-score`],                   # Purple: 2, Yellow: 6
                   'SFARI_bool' = ifelse(`gene-bool` == T, '#21908CFF', 'white'), # Acqua
                   'Syndromic' = ifelse(syndromic == T, 'orange', 'white')) %>% 
            dplyr::select(SFARI_score, SFARI_bool, Syndromic)

h_clusts %>% as.dendrogram %>% set('labels', rep('', nrow(datMeta))) %>% 
             set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)

Consensus Clustering

Samples are grouped into two big clusters and two outliers, the first big cluster has three main subclusters, two small subclusters and two outliers, and the second one two main subclustersand five small subclusters.

*Output plots in clustering_genes_03_20 folder

Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign genes to clusters with FDR<0.01 using the fdrtool package

ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c

clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])

Leaves most of the observations (~80%) without a cluster (every time it leaves out more genes):

ICA_clusters %>% rowSums %>% table
## .
##    0    1    2    3    4    5 
## 2355  425  112   27    8    1
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) + 
  geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
  theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()

WGCNA

best_power=56 but blockwiseModules only accepts powers up to 30, so 30 was used instead

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(20, 60, by=2))
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1     20    0.238 -0.225          0.940     516       517   1130
## 2     22    0.302 -0.248          0.933     478       463   1080
## 3     24    0.366 -0.272          0.942     446       417   1040
## 4     26    0.436 -0.293          0.958     417       376    998
## 5     28    0.485 -0.314          0.949     391       339    963
## 6     30    0.522 -0.330          0.945     368       309    931
## 7     32    0.578 -0.352          0.967     347       282    901
## 8     34    0.625 -0.371          0.965     328       258    873
## 9     36    0.645 -0.384          0.954     311       236    847
## 10    38    0.688 -0.402          0.946     295       215    822
## 11    40    0.715 -0.415          0.938     281       200    799
## 12    42    0.740 -0.426          0.935     267       185    778
## 13    44    0.763 -0.442          0.930     255       172    757
## 14    46    0.787 -0.457          0.936     244       159    738
## 15    48    0.802 -0.469          0.933     233       148    719
## 16    50    0.809 -0.480          0.923     223       138    702
## 17    52    0.823 -0.493          0.924     214       128    685
## 18    54    0.839 -0.503          0.929     206       119    669
## 19    56    0.854 -0.515          0.933     198       111    654
## 20    58    0.858 -0.527          0.926     190       104    640
## 21    60    0.872 -0.534          0.937     183        98    626
network = datExpr_redDim %>% t %>% blockwiseModules(power=30, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)

It leaves 304 genes without a cluster:

clusterings[['WGCNA']] %>% table
## .
##    0    1    2    3    4 
##  304 1742  528  227  127

Gaussian Mixture Models with hard thresholding

Number of clusters that resemble more Gaussian mixtures = 39

n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 39
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)

Plot of clusters with their centroids in gray

gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())

Trying with 27 clusters (27 has the 9th lowest value)

best_k = 27
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 159  71 159 165 109 122  96 106 135  78  54 109 154  90 146  71 102  40 
##  19  20  21  22  23  24  25  26  27 
## 160 159  99  87  55  98  82  67 155

Trying with 6 clusters

best_k = 6
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_3']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_3']] %>% table
## .
##   1   2   3   4   5   6 
## 662 455 366 684 431 330

Manual clustering

Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.

# Modify equation sign if needed so blue cluter represents overexpressed genes
manual_clusters = as.factor(as.numeric(-0.1*datExpr_redDim$PC1 + 0.05 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() + 
  geom_abline(slope=-0.1, intercept=0.05, color='gray') + theme_minimal()

names(manual_clusters) = rownames(datExpr_redDim)

clusterings[['Manual']] = manual_clusters

clusterings[['Manual']] %>% table
## .
##    0    1 
## 1713 1215

Cluster 2 has a bigger mean expression than cluster 1. There also seem to be more than one distributions in cluster 2, with two different means and three different standard deviations.

manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd), 
                             manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)

Separating cluster 2 genes by their sd: It seems that the bigger the sd of the cluster the genes belong to, the bigger their (absolute) value is in the first PC component (blue samples are really close to 0, and the purple ones are between 10 and 20). The second PC also seems to play a small part on this.

gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

n_clusters = 3
c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_sd = c2_sd %>% GMM(n_clusters)

clusterings[['Manual_sd']] = as.character(clusterings[['Manual']])
clusterings[['Manual_sd']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>% 
  apply(1, function(x) glue('1_',which.max(x)))

plot_gaussians = manual_clusters_data %>% ggplot(aes(x=sd)) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[2], # green
                args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[3], # acqua
                args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[4], # acqua
                args=list(mean=gmm_c2_sd$centroids[3], sd=gmm_c2_sd$covariance_matrices[3])) +
  theme_minimal()

plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_sd']]))) + 
          geom_point() + theme_minimal()

grid.arrange(plot_gaussians, plot_points, ncol=2)

clusterings[['Manual_sd']] %>% table
## .
##    0  1_1  1_2  1_3 
## 1713  422  436  357

Separating cluster 2 genes by their mean:

n_clusters = 2

c2_mean = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(mean)
rownames(c2_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_mean= c2_mean %>% GMM(n_clusters)

clusterings[['Manual_mean']] = as.character(clusterings[['Manual']])
clusterings[['Manual_mean']][clusterings[['Manual']]==1] = gmm_c2_mean$Log_likelihood %>% 
  apply(1, function(x) glue('1_',which.max(x)))

plot_gaussians = manual_clusters_data %>% ggplot(aes(x=mean)) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[2], # green
                args=list(mean=gmm_c2_mean$centroids[1], sd=gmm_c2_mean$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(n_clusters+1)[3], # blue
                args=list(mean=gmm_c2_mean$centroids[2], sd=gmm_c2_mean$covariance_matrices[2])) +
  theme_minimal()

plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_mean']]))) + 
          geom_point() + theme_minimal()

grid.arrange(plot_gaussians, plot_points, ncol=2)

clusterings[['Manual_mean']] %>% table
## .
##    0  1_1  1_2 
## 1713  622  593
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output, 
   ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, 
   best_power, c, manual_clusters, manual_clusters_data, c2_sd, c2_mean, gmm_c2_sd, gmm_c2_mean,
   p1, p2, pca_data_projection, dend_meta, plot_gaussians, plot_points, n_clusters, 
   viridis_score_cols, gg_colour_hue, create_viridis_dict)

Compare clusterings

Using Adjusted Rand Index:

cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
  cluster1 = as.factor(clusterings[[i]])
  for(j in (i):length(clusterings)){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, cluster_sim)

Scatter plots

create_3D_plot = function(cat_var){
  plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>% 
    layout(title = glue('Samples coloured by ', cat_var),
           scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,1]*100,2),'%)')),
                        yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
                        zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,3]*100,2),'%)'))))  
}

plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),                           km_clust = as.factor(clusterings[['km']]),
                hc_clust = as.factor(clusterings[['hc']]),       cc_l1_clust = as.factor(clusterings[['cc_l1']]),
                cc_clust = as.factor(clusterings[['cc_l2']]),    ica_clust = as.factor(clusterings[['ICA_min']]),
                n_ica_clust = as.factor(rowSums(ICA_clusters)),  gmm_clust = as.factor(clusterings[['GMM']]),
                gmm_clust2 = as.factor(clusterings[['GMM_2']]),  gmm_clust3 = as.factor(clusterings[['GMM_3']]),
                wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]),
                manual2_clust = as.factor(clusterings[['Manual_mean']]),
                manual3_clust = as.factor(clusterings[['Manual_sd']]),
                SFARI_clust = as.factor(clusterings[['SFARI_score']]),
                SFARI_bool = as.factor(clusterings[['SFARI_bool']]),
                syndromic = as.factor(clusterings[['syndromic']]))

2D plots of clusterings

  • Simple methods seem to only partition the space in buckets using information from the first component

  • All clusterings except K-means manage to separate the two clouds at least partially, but no one does a good job

  • WGCNA creates clusters inverted between clouds

  • Manual classification + GMM by standard deviation clusters make sense

  • SFARI genes seem to be everywhere

selectable_scatter_plot(plot_points, plot_points)

3D plots

create_3D_plot('ica_clust')
create_3D_plot('gmm_clust')
create_3D_plot('gmm_clust3')
create_3D_plot('wgcna_clust')
create_3D_plot('manual2_clust')